original_world_happiness_2021 <- read.csv("/Users/karissatschida/Documents/2021-2022/Spring 2022/ISyE312/Project/final project/original_world_happiness_2021.csv")
world_happiness_2021 <- original_world_happiness_2021 %>% 
  summarize(country_name, regional_indicator, ladder_score, logged_gdp_per_capita, social_support, healthy_life_expectancy, freedom_to_make_life_choices, generosity, perceptions_of_corruption)

Data

Link to Kaggle Dataset: https://www.kaggle.com/datasets/ajaypalsinghlo/world-happiness-report-2021

Link to World Happiness Report 2021 Webpage: https://worldhappiness.report/ed/2021/

Link to World Happiness Report 2021 PDF: https://happiness-report.s3.amazonaws.com/2021/WHR+21.pdf

Independent Variables vs. Ladder Score

  1. Logged GPD per Capita vs. Ladder Score
plot(world_happiness_2021$ladder_score, world_happiness_2021$logged_gdp_per_capita,
     xlab = "Ladder Score",
     ylab = "Logged GDP per Capita",
     main = "GDP vs. Ladder Score")

  1. Social Support vs. Ladder Score
plot(world_happiness_2021$ladder_score, world_happiness_2021$social_support,
     xlab = "Ladder Score",
     ylab = "Social Support",
     main = "Social Support vs. Ladder Score")

  1. Healthy Life Expectancy vs. Ladder Score
plot(world_happiness_2021$ladder_score, world_happiness_2021$healthy_life_expectancy,
     xlab = "Ladder Score",
     ylab = "Healthy Life Expectancy",
     main = "Healthy Life Expectancy vs. Ladder Score")

  1. Freedom to Make Life Choices vs. Ladder Score
plot(world_happiness_2021$ladder_score, world_happiness_2021$freedom_to_make_life_choices,
     xlab = "Ladder Score",
     ylab = "Freedom to Make Life Choices",
     main = "Freedom to Make Life Choices vs. Ladder Score")

  1. Generosity vs. Ladder Score
plot(world_happiness_2021$ladder_score, world_happiness_2021$generosity,
     xlab = "Ladder Score",
     ylab = "Generosity",
     main = "Generosity vs. Ladder Score")

  1. Perceptions of Corruption vs. Ladder Score
plot(world_happiness_2021$ladder_score, world_happiness_2021$perceptions_of_corruption,
     xlab = "Ladder Score",
     ylab = "Perceptions of Corruption",
     main = "Perceptions of Corruption vs. Ladder Score")

Correlation Plot

num_world_happiness_2021 <- world_happiness_2021 %>% 
  summarize(ladder_score, logged_gdp_per_capita, social_support, healthy_life_expectancy, freedom_to_make_life_choices, generosity, perceptions_of_corruption)
cor(num_world_happiness_2021, method = "pearson", use = "complete.obs")
##                              ladder_score logged_gdp_per_capita social_support
## ladder_score                   1.00000000             0.7897597      0.7568876
## logged_gdp_per_capita          0.78975970             1.0000000      0.7852987
## social_support                 0.75688765             0.7852987      1.0000000
## healthy_life_expectancy        0.76809946             0.8594606      0.7232561
## freedom_to_make_life_choices   0.60775307             0.4323235      0.4829298
## generosity                    -0.01779928            -0.1992864     -0.1149459
## perceptions_of_corruption     -0.42114000            -0.3423374     -0.2032070
##                              healthy_life_expectancy
## ladder_score                               0.7680995
## logged_gdp_per_capita                      0.8594606
## social_support                             0.7232561
## healthy_life_expectancy                    1.0000000
## freedom_to_make_life_choices               0.4614939
## generosity                                -0.1617503
## perceptions_of_corruption                 -0.3643735
##                              freedom_to_make_life_choices  generosity
## ladder_score                                    0.6077531 -0.01779928
## logged_gdp_per_capita                           0.4323235 -0.19928640
## social_support                                  0.4829298 -0.11494585
## healthy_life_expectancy                         0.4614939 -0.16175028
## freedom_to_make_life_choices                    1.0000000  0.16943737
## generosity                                      0.1694374  1.00000000
## perceptions_of_corruption                      -0.4013630 -0.16396173
##                              perceptions_of_corruption
## ladder_score                                -0.4211400
## logged_gdp_per_capita                       -0.3423374
## social_support                              -0.2032070
## healthy_life_expectancy                     -0.3643735
## freedom_to_make_life_choices                -0.4013630
## generosity                                  -0.1639617
## perceptions_of_corruption                    1.0000000
corrplot(cor(num_world_happiness_2021))

Outliers

Data WITH Outliers

happiness_model <- lm(ladder_score ~ regional_indicator + logged_gdp_per_capita + social_support + healthy_life_expectancy + freedom_to_make_life_choices + generosity + perceptions_of_corruption, data = world_happiness_2021)
anova(happiness_model)
## Analysis of Variance Table
## 
## Response: ladder_score
##                               Df  Sum Sq Mean Sq F value    Pr(>F)    
## regional_indicator             9 106.053 11.7837 48.0061 < 2.2e-16 ***
## logged_gdp_per_capita          1  18.245 18.2452 74.3300 1.716e-14 ***
## social_support                 1   4.328  4.3284 17.6338 4.872e-05 ***
## healthy_life_expectancy        1   0.943  0.9432  3.8426   0.05206 .  
## freedom_to_make_life_choices   1   7.511  7.5112 30.6000 1.621e-07 ***
## generosity                     1   0.687  0.6873  2.8002   0.09661 .  
## perceptions_of_corruption      1   0.275  0.2752  1.1212   0.29159    
## Residuals                    133  32.646  0.2455                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yhat <- happiness_model$fitted.values
res <- happiness_model$residuals
t <- rstudent(happiness_model)
stdres <- rstandard(happiness_model)
par(mfrow=c(1,2))

plot(yhat, res, ylab="Residuals", xlab="Fitted Value", main="Residual vs Fitted")
abline(0, 0)

plot(yhat, t, ylab="Studentized", xlab="Fitted Value", main="Studentized Residual vs Fitted")
abline(0, 0)

par(mfrow=c(1,2))

qqnorm(res, ylab="Residuals", xlab="Normal Scores",main="Residual Q-Q Plot") 
qqline(res)

qqnorm(t, ylab="Studentized Residuals", xlab="Normal Scores",main="Studentized Residual Q-Q Plot") 
qqline(t)

Extract Outliers

Create a linear model.

mod <- lm(ladder_score ~ regional_indicator + logged_gdp_per_capita + social_support + healthy_life_expectancy + freedom_to_make_life_choices + generosity + perceptions_of_corruption, data = world_happiness_2021)

Cook’s Distance

cooksd <- cooks.distance(mod)

Plot Cook’s Distance.

plot(cooksd, 
     xlab = "Index",
     ylab = "Cook's Distance",
     main = "1. Cook's Distance for Influential Observations")

Plot Cook’s Distance using the traditional cut-off point of 4/sample size (or 4/n).

n <- nrow(world_happiness_2021)
plot(cooksd, 
     xlab = "Index",
     ylab = "Cook's Distance",
     main = "2. Cooks Distance for Influential Observations",
     sub = "cut-off point of 4/n represented by blue dashed line")
abline(h = 4/n, lty = 2, col = "blue")

Create a bar plot of Cook’s Distance to display the observations that are influential points of the fitted model.

ols_plot_cooksd_bar(mod)

Create a chart of Cook’s Distance to display the observations that are influential points of the fitted model.

ols_plot_cooksd_chart(mod)

As displayed above by the “2. Cook’s Distance for Influential Observations” plot, the “Cook’s D Bar Plot”, and the “Cook’s D Chart” above, there are 13 influential points in our data.

Remove the 13 outliers in the data set.

influential_obs <- as.numeric(names(cooksd)[(cooksd > 4/n)])

outliers_removed <- world_happiness_2021[-influential_obs, ]

Data WITHOUT Outliers

nooutliers_happiness_model <- lm(ladder_score ~ regional_indicator + logged_gdp_per_capita + social_support + healthy_life_expectancy + freedom_to_make_life_choices + generosity + perceptions_of_corruption, data = outliers_removed)
anova(nooutliers_happiness_model)
## Analysis of Variance Table
## 
## Response: ladder_score
##                               Df Sum Sq Mean Sq F value    Pr(>F)    
## regional_indicator             9 97.179 10.7977 65.7565 < 2.2e-16 ***
## logged_gdp_per_capita          1 14.037 14.0369 85.4823 1.075e-15 ***
## social_support                 1  2.201  2.2013 13.4054 0.0003744 ***
## healthy_life_expectancy        1  0.419  0.4191  2.5524 0.1127593    
## freedom_to_make_life_choices   1  9.105  9.1054 55.4508 1.606e-11 ***
## generosity                     1  1.284  1.2839  7.8188 0.0060234 ** 
## perceptions_of_corruption      1  0.825  0.8254  5.0265 0.0267994 *  
## Residuals                    120 19.705  0.1642                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yhat <- nooutliers_happiness_model$fitted.values
res <- nooutliers_happiness_model$residuals
t <- rstudent(nooutliers_happiness_model)
stdres <- rstandard(nooutliers_happiness_model)
par(mfrow=c(1,2))

plot(yhat, res, ylab="Residuals", xlab="Fitted Value", main="Residual vs Fitted")
abline(0, 0)

plot(yhat, t, ylab="Studentized", xlab="Fitted Value", main="Studentized Residual vs Fitted")
abline(0, 0)

par(mfrow=c(1,2))

qqnorm(res, ylab="Residuals", xlab="Normal Scores",main="Residual Q-Q Plot") 
qqline(res)

qqnorm(t, ylab="Studentized Residuals", xlab="Normal Scores",main="Studentized Residual Q-Q Plot") 
qqline(t)

13 outliers were removed from the data, resulting in a bit more linear distribution of the residuals on the Q-Q plots.

Questions

Question 1

If at all, does generosity affect ladder score calculations?

ggplot(world_happiness_2021, aes(x = regional_indicator, y = ladder_score)) +
  geom_point(color = "blue", alpha = 0.5, size = 1) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Region") +
  ylab("Ladder Score") +
  ggtitle("Ladder Score vs. Country Region", subtitle = "How does the region of a country affect ladder score?")

ggplot(world_happiness_2021, aes(x = regional_indicator, y = social_support)) +
  geom_point(color = "blue", alpha = 0.5, size = 1) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Region") +
  ylab("Social Support") +
  ggtitle("Social Support vs. Country Region", subtitle = "How does the region of a country affect social support?")

ggplot(world_happiness_2021, aes(x = regional_indicator, y = freedom_to_make_life_choices)) +
  geom_point(color = "blue", alpha = 0.5, size = 1) +
  theme(axis.text.x = element_text(angle = 90)) +
  xlab("Region") +
  ylab("Freedom to Make Life Choices") +
  ggtitle("Freedom to Make Life Choices vs. Country Region", subtitle = "How does the region of a country affect the freedom to make life choices?")

Create a linear model where the dependent variable is the ladder score, and the independent variables are social support and regional indicator.

world_model <-lm(world_happiness_2021$ladder_score ~ world_happiness_2021$social_support + world_happiness_2021$regional_indicator, data = world_happiness_2021)
summary(world_model)
## 
## Call:
## lm(formula = world_happiness_2021$ladder_score ~ world_happiness_2021$social_support + 
##     world_happiness_2021$regional_indicator, data = world_happiness_2021)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7096 -0.2835  0.0285  0.3640  1.4449 
## 
## Coefficients:
##                                                                           Estimate
## (Intercept)                                                                2.16448
## world_happiness_2021$social_support                                        4.30498
## world_happiness_2021$regional_indicatorCommonwealth of Independent States -0.45357
## world_happiness_2021$regional_indicatorEast Asia                          -0.05858
## world_happiness_2021$regional_indicatorLatin America and Caribbean         0.12954
## world_happiness_2021$regional_indicatorMiddle East and North Africa       -0.37856
## world_happiness_2021$regional_indicatorNorth America and ANZ               0.94533
## world_happiness_2021$regional_indicatorSouth Asia                         -0.75086
## world_happiness_2021$regional_indicatorSoutheast Asia                     -0.28844
## world_happiness_2021$regional_indicatorSub-Saharan Africa                 -0.66950
## world_happiness_2021$regional_indicatorWestern Europe                      0.81363
##                                                                           Std. Error
## (Intercept)                                                                  0.55370
## world_happiness_2021$social_support                                          0.60313
## world_happiness_2021$regional_indicatorCommonwealth of Independent States    0.22071
## world_happiness_2021$regional_indicatorEast Asia                             0.27821
## world_happiness_2021$regional_indicatorLatin America and Caribbean           0.19510
## world_happiness_2021$regional_indicatorMiddle East and North Africa          0.20779
## world_happiness_2021$regional_indicatorNorth America and ANZ                 0.32622
## world_happiness_2021$regional_indicatorSouth Asia                            0.28514
## world_happiness_2021$regional_indicatorSoutheast Asia                        0.24448
## world_happiness_2021$regional_indicatorSub-Saharan Africa                    0.20700
## world_happiness_2021$regional_indicatorWestern Europe                        0.19152
##                                                                           t value
## (Intercept)                                                                 3.909
## world_happiness_2021$social_support                                         7.138
## world_happiness_2021$regional_indicatorCommonwealth of Independent States  -2.055
## world_happiness_2021$regional_indicatorEast Asia                           -0.211
## world_happiness_2021$regional_indicatorLatin America and Caribbean          0.664
## world_happiness_2021$regional_indicatorMiddle East and North Africa        -1.822
## world_happiness_2021$regional_indicatorNorth America and ANZ                2.898
## world_happiness_2021$regional_indicatorSouth Asia                          -2.633
## world_happiness_2021$regional_indicatorSoutheast Asia                      -1.180
## world_happiness_2021$regional_indicatorSub-Saharan Africa                  -3.234
## world_happiness_2021$regional_indicatorWestern Europe                       4.248
##                                                                           Pr(>|t|)
## (Intercept)                                                               0.000145
## world_happiness_2021$social_support                                       4.92e-11
## world_happiness_2021$regional_indicatorCommonwealth of Independent States 0.041757
## world_happiness_2021$regional_indicatorEast Asia                          0.833551
## world_happiness_2021$regional_indicatorLatin America and Caribbean        0.507797
## world_happiness_2021$regional_indicatorMiddle East and North Africa       0.070642
## world_happiness_2021$regional_indicatorNorth America and ANZ              0.004372
## world_happiness_2021$regional_indicatorSouth Asia                         0.009421
## world_happiness_2021$regional_indicatorSoutheast Asia                     0.240106
## world_happiness_2021$regional_indicatorSub-Saharan Africa                 0.001526
## world_happiness_2021$regional_indicatorWestern Europe                     3.94e-05
##                                                                              
## (Intercept)                                                               ***
## world_happiness_2021$social_support                                       ***
## world_happiness_2021$regional_indicatorCommonwealth of Independent States *  
## world_happiness_2021$regional_indicatorEast Asia                             
## world_happiness_2021$regional_indicatorLatin America and Caribbean           
## world_happiness_2021$regional_indicatorMiddle East and North Africa       .  
## world_happiness_2021$regional_indicatorNorth America and ANZ              ** 
## world_happiness_2021$regional_indicatorSouth Asia                         ** 
## world_happiness_2021$regional_indicatorSoutheast Asia                        
## world_happiness_2021$regional_indicatorSub-Saharan Africa                 ** 
## world_happiness_2021$regional_indicatorWestern Europe                     ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5849 on 138 degrees of freedom
## Multiple R-squared:  0.7234, Adjusted R-squared:  0.7034 
## F-statistic:  36.1 on 10 and 138 DF,  p-value: < 2.2e-16

Extract the coefficients and R^2 values from the summary to find which region has the highest social support coefficient.

coeffs = coefficients(world_model)
coeffs
##                                                               (Intercept) 
##                                                                2.16447692 
##                                       world_happiness_2021$social_support 
##                                                                4.30497761 
## world_happiness_2021$regional_indicatorCommonwealth of Independent States 
##                                                               -0.45356989 
##                          world_happiness_2021$regional_indicatorEast Asia 
##                                                               -0.05857683 
##        world_happiness_2021$regional_indicatorLatin America and Caribbean 
##                                                                0.12954437 
##       world_happiness_2021$regional_indicatorMiddle East and North Africa 
##                                                               -0.37856495 
##              world_happiness_2021$regional_indicatorNorth America and ANZ 
##                                                                0.94532647 
##                         world_happiness_2021$regional_indicatorSouth Asia 
##                                                               -0.75086403 
##                     world_happiness_2021$regional_indicatorSoutheast Asia 
##                                                               -0.28843800 
##                 world_happiness_2021$regional_indicatorSub-Saharan Africa 
##                                                               -0.66949785 
##                     world_happiness_2021$regional_indicatorWestern Europe 
##                                                                0.81362831
summary(world_model)$r.squared
## [1] 0.7234259

Western Europe has the highest social support coefficient.

To further explore this, test a multi-linear regression model again social support, ladder score, and freedom to make life choices in Western Europe to see how those factor change the ladder score.

weurope <- subset(world_happiness_2021, regional_indicator == "Western Europe")
pairs(~weurope$ladder_score+weurope$social_support+weurope$freedom_to_make_life_choices,main="Simple Scatterplot Matrix")

Create a linear regression model and display results.

weurope_model_generosity <-lm(ladder_score~social_support+freedom_to_make_life_choices+generosity, data=weurope)

summary(weurope_model_generosity)
## 
## Call:
## lm(formula = ladder_score ~ social_support + freedom_to_make_life_choices + 
##     generosity, data = weurope)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6697 -0.1589  0.1042  0.2175  0.4807 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                   -2.0271     1.7891  -1.133  0.27294   
## social_support                 8.1134     2.4618   3.296  0.00427 **
## freedom_to_make_life_choices   1.7743     1.3218   1.342  0.19716   
## generosity                     0.3631     0.7601   0.478  0.63895   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3668 on 17 degrees of freedom
## Multiple R-squared:  0.7347, Adjusted R-squared:  0.6878 
## F-statistic: 15.69 on 3 and 17 DF,  p-value: 3.792e-05

Test if generosity is significant or insignificant via R^2.

First, examine an ANOVA table.

anova(weurope_model_generosity)
## Analysis of Variance Table
## 
## Response: ladder_score
##                              Df Sum Sq Mean Sq F value    Pr(>F)    
## social_support                1 5.9202  5.9202 44.0015 4.229e-06 ***
## freedom_to_make_life_choices  1 0.3822  0.3822  2.8409    0.1102    
## generosity                    1 0.0307  0.0307  0.2282    0.6390    
## Residuals                    17 2.2873  0.1345                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Next, calculate R^2 and the adjusted R^2 values using the values from the ANOVA table.

R^2 = 1 - (SS(Res) / SS(T))

adjusted R^2 = 1 - SS(Res)/(n-p) / SS(T)/(n-1)

rsq1 <- 1 - ((2.2873) / (5.92+0.3822+0.0307+2.2873))
rsq1
## [1] 0.7346581
adj_rsq1 <- 1 - (2.2873 / 17) / ((5.92+0.3822+0.0307+2.2873) / 20)
adj_rsq1
## [1] 0.6878331

Finally, test for the significance of regression coefficient (x3 = generosity) via a hypothesis test.

H0: B3 = 0

H1: B3 <=> 0 (testing whether B3 is insignificant in regression equation)

to = B3 / se(B3) = 0.3631 / sqrt(sigmsq*vif(generosity))

sigmasq <- (sigma(weurope_model_generosity))^2
sigmasq
## [1] 0.1345448
vif <- vif(weurope_model_generosity)
vif
##               social_support freedom_to_make_life_choices 
##                     2.120312                     2.524734 
##                   generosity 
##                     1.570089
t0 <- 0.3631 / (sqrt(0.1345448 * 1.570089))
t0
## [1] 0.7900065

t_(alpha/2, n-3) = t_(0.005, 17) = 2.898

t0 = 0.7900065

Since t0 < t_(alpha/2, n-3), we fail to reject H0. Therefore, there is not any strong evidence that there is a correlation between generosity and ladder score.

Question 2

How do social support and freedom to make life choices affect the ladder score in Western European countries?

num_weurope <- weurope %>% 
  summarize(ladder_score, logged_gdp_per_capita, social_support, healthy_life_expectancy, freedom_to_make_life_choices, generosity, perceptions_of_corruption)

Correlation plot of each of the individual variables for Western European regions.

cor(num_weurope, method = "pearson", use = "complete.obs")
##                              ladder_score logged_gdp_per_capita social_support
## ladder_score                    1.0000000             0.6797150      0.8287136
## logged_gdp_per_capita           0.6797150             1.0000000      0.5049771
## social_support                  0.8287136             0.5049771      1.0000000
## healthy_life_expectancy        -0.2763497            -0.1704001     -0.1964397
## freedom_to_make_life_choices    0.7452482             0.5734928      0.7240141
## generosity                      0.5246678             0.4215327      0.4846631
## perceptions_of_corruption      -0.8091182            -0.6570477     -0.6256685
##                              healthy_life_expectancy
## ladder_score                              -0.2763497
## logged_gdp_per_capita                     -0.1704001
## social_support                            -0.1964397
## healthy_life_expectancy                    1.0000000
## freedom_to_make_life_choices              -0.2962064
## generosity                                -0.1146481
## perceptions_of_corruption                  0.2872368
##                              freedom_to_make_life_choices generosity
## ladder_score                                    0.7452482  0.5246678
## logged_gdp_per_capita                           0.5734928  0.4215327
## social_support                                  0.7240141  0.4846631
## healthy_life_expectancy                        -0.2962064 -0.1146481
## freedom_to_make_life_choices                    1.0000000  0.5978758
## generosity                                      0.5978758  1.0000000
## perceptions_of_corruption                      -0.7032823 -0.4755846
##                              perceptions_of_corruption
## ladder_score                                -0.8091182
## logged_gdp_per_capita                       -0.6570477
## social_support                              -0.6256685
## healthy_life_expectancy                      0.2872368
## freedom_to_make_life_choices                -0.7032823
## generosity                                  -0.4755846
## perceptions_of_corruption                    1.0000000
corrplot(cor(num_weurope))

pairs(~ weurope$ladder_score + weurope$social_support + weurope$freedom_to_make_life_choices, main="Simple Scatterplot Matrix")

weurope_model <-lm(ladder_score~social_support + freedom_to_make_life_choices, data = weurope)
summary(weurope_model)
## 
## Call:
## lm(formula = ladder_score ~ social_support + freedom_to_make_life_choices, 
##     data = weurope)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7618 -0.1386  0.1121  0.2124  0.4276 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                    -2.351      1.620  -1.451  0.16389   
## social_support                  8.224      2.398   3.430  0.00299 **
## freedom_to_make_life_choices    2.033      1.180   1.723  0.10206   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3589 on 18 degrees of freedom
## Multiple R-squared:  0.7311, Adjusted R-squared:  0.7012 
## F-statistic: 24.47 on 2 and 18 DF,  p-value: 7.349e-06

First, examine an ANOVA table.

anova(weurope_model)
## Analysis of Variance Table
## 
## Response: ladder_score
##                              Df Sum Sq Mean Sq F value    Pr(>F)    
## social_support                1 5.9202  5.9202 45.9727 2.373e-06 ***
## freedom_to_make_life_choices  1 0.3822  0.3822  2.9681    0.1021    
## Residuals                    18 2.3180  0.1288                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R^2 = 1 - (SS(Res) / SS(T))

adjusted R^2 = 1 - SS(Res)/(n-p) / SS(T)/(n-1)

rsq2 <- 1 - ((2.3180) / (5.92+0.3822+2.3180))
rsq2
## [1] 0.7310967
adj_rsq2 <- 1 - (2.3180 / 18) / ((5.92+0.3822+2.3180) / 20)
adj_rsq2
## [1] 0.7012186

The estimated regression function is y = 8.224X1 + 2.033X2 - 2.351. B1 = 8.224 which means for every one unit increase in a persons rating of the social_support, their ladder score in Western Europe increases by 8.224.

Ho = B1 = B2

Ha = Bj <=> 0 for at least one j in J={1,2}

Testing the regression relation, using alpha = 0.01, our test shows that the p-value is much less than 0.01, we reject the null hypothesis that B1 = B2, so there is a significant relationship between the variables in the linear regression model. Our test implies that there is a linear relationship between B1 and B2. Which we would expect since these are factors in calculating the ladder score.

Question 3

How does the generosity of a country affect their perceptions of corruption?

To see how the generosity of a country affect their perceptions of corruption, we will perform a simple linear regression analysis where the independent variable is generosity (X) and the perception of corruption is the dependent variable (Y).

genper_model <-lm(perceptions_of_corruption ~ generosity, data = world_happiness_2021)
summary(genper_model)
## 
## Call:
## lm(formula = perceptions_of_corruption ~ generosity, data = world_happiness_2021)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64601 -0.05258  0.04683  0.11089  0.24822 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.72450    0.01461  49.600   <2e-16 ***
## generosity  -0.19505    0.09679  -2.015   0.0457 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1774 on 147 degrees of freedom
## Multiple R-squared:  0.02688,    Adjusted R-squared:  0.02026 
## F-statistic: 4.061 on 1 and 147 DF,  p-value: 0.04571

y = -0.19505x + 0.72450

First, we will conduct a t test to determine whether or not there is a linear association between the two variables here as the alpha risk value of 0.10.

alpha <- 0.1
y <- length(world_happiness_2021$perceptions_of_corruption)
x <- length(unique(world_happiness_2021$generosity))

F.test.world_happiness_2021 <- qf(1 - alpha, x - 1, y - x)
F.test.world_happiness_2021
## [1] 1.663497
e_df <- world_happiness_2021 %>% 
  mutate(y_i = perceptions_of_corruption) %>% 
  mutate(x = generosity) %>% 
  mutate(y_hat = -0.19505*x + 0.72450) %>% 
  mutate(e_i = y_i - y_hat) %>% 
  summarize(x, y_i, y_hat, e_i)
e_i_sqrd <- e_df$e_i^2

sum_residuals <- sum(e_i_sqrd)
sum_residuals
## [1] 4.626261
S_xx = (sum((world_happiness_2021$generosity)^2)) - (((sum(world_happiness_2021$generosity))^2) / (149-2))

se_B1 = sqrt((sum_residuals / (149-2)) / (S_xx))
se_B1
## [1] 0.09679816

Testing Significance of Regression

Alternatives:

Ho: β1 = 0

Ha: β1 <=> 0

t_0 = (-0.19505 + 0) / (0.09679816)
t_0
## [1] -2.015018

t_(alpha/2, n-2) = t_(0.005, 147) = 1.655285

Since |-2.015018| > 1.655285, we reject the null hypothesis, and can we therefore say that there is a linear relationship between generosity and perceptions of cor

As shown in the summary of the genper_model model above, the β0 value 0.72450, which means that when generosity (x) is 0, the perception of corruption (y) is expected to be 0.72450. We also see that the slope, β1, is -0.19505. This means that the perception of corruption (y) is expected to decrease by -0.19505 for each 1 unit of increase in generosity (x).

Question 4

For Western Europe, Latin America & the Caribbean, and Sub-Saharan Africa, which factor has the highest impact on the ladder score of the region?

3 different dfs -> instead of generosity change to factors we want look at, look at beta values to understand correlation

weurope <- subset(world_happiness_2021, regional_indicator == "Western Europe")
weurope_all_model <-lm(ladder_score ~ logged_gdp_per_capita + social_support + healthy_life_expectancy + freedom_to_make_life_choices + generosity + perceptions_of_corruption, data = weurope)

summary(weurope_all_model)
## 
## Call:
## lm(formula = ladder_score ~ logged_gdp_per_capita + social_support + 
##     healthy_life_expectancy + freedom_to_make_life_choices + 
##     generosity + perceptions_of_corruption, data = weurope)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5306 -0.1438 -0.0051  0.1730  0.5004 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                  -0.08296    7.64206  -0.011  0.99149   
## logged_gdp_per_capita         0.37140    0.30832   1.205  0.24833   
## social_support                6.57583    2.13144   3.085  0.00807 **
## healthy_life_expectancy      -0.03521    0.09279  -0.379  0.71008   
## freedom_to_make_life_choices  0.07100    1.24412   0.057  0.95530   
## generosity                    0.17723    0.64510   0.275  0.78754   
## perceptions_of_corruption    -1.00238    0.47709  -2.101  0.05423 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3082 on 14 degrees of freedom
## Multiple R-squared:  0.8458, Adjusted R-squared:  0.7797 
## F-statistic:  12.8 on 6 and 14 DF,  p-value: 5.594e-05

Western Europe

  • Logged GDP per Capita INCREASES by 1; Ladder Score INCREASES by 0.37140

  • Social Support INCREASES by 1; Ladder Score INCREASES by 6.57583

  • Healthy Life Expectancy INCREASES by 1; Ladder Score DECREASES by 0.03521

  • Freedom to Make Life Choices INCREASES by 1; Ladder Score INCREASES by 0.07100

  • Generosity INCREASES by 1; Ladder Score INCREASES by 0.17723

  • Perceptions of Corruption INCREASES by 1; Ladder Score DECREASES by 1.00238

In Western Europe, the Social Score is what has the largest impact on the ladder score.

latamerica <- subset(world_happiness_2021, regional_indicator == "Latin America and Caribbean")
latamerica_all_model <-lm(ladder_score ~ logged_gdp_per_capita + social_support + healthy_life_expectancy + freedom_to_make_life_choices + generosity + perceptions_of_corruption, data = latamerica)

summary(latamerica_all_model)
## 
## Call:
## lm(formula = ladder_score ~ logged_gdp_per_capita + social_support + 
##     healthy_life_expectancy + freedom_to_make_life_choices + 
##     generosity + perceptions_of_corruption, data = latamerica)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3983 -0.1434 -0.0241  0.1424  0.5490 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -5.49714    2.60454  -2.111 0.054749 .  
## logged_gdp_per_capita         0.02715    0.20799   0.131 0.898157    
## social_support                0.48564    1.84805   0.263 0.796836    
## healthy_life_expectancy       0.11484    0.04830   2.378 0.033452 *  
## freedom_to_make_life_choices  4.13883    0.94984   4.357 0.000776 ***
## generosity                    0.38819    0.94989   0.409 0.689438    
## perceptions_of_corruption    -0.47465    0.90271  -0.526 0.607874    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3006 on 13 degrees of freedom
## Multiple R-squared:  0.8714, Adjusted R-squared:  0.8121 
## F-statistic: 14.69 on 6 and 13 DF,  p-value: 4.078e-05

Latin America & Caribbean

  • Logged GDP per Capita INCREASES by 1; Ladder Score INCREASES by 0.02715

  • Social Support INCREASES by 1; Ladder Score INCREASES by 0.48564

  • Healthy Life Expectancy INCREASES by 1; Ladder Score INCREASES by 0.11484

  • Freedom to Make Life Choices INCREASES by 1; Ladder Score INCREASES by 4.13883

  • Generosity INCREASES by 1; Ladder Score INCREASES by 0.38819

  • Perceptions of Corruption INCREASES by 1; Ladder Score DECREASES by 0.47465

In Latin America & Caribbean, the Freedom to Make Life Choices is what has the largest impact on the ladder score.

ssafrica <- subset(world_happiness_2021, regional_indicator == "Sub-Saharan Africa")
ssafrica_all_model <-lm(ladder_score ~ logged_gdp_per_capita + social_support + healthy_life_expectancy + freedom_to_make_life_choices + generosity + perceptions_of_corruption, data = ssafrica)

summary(ssafrica_all_model)
## 
## Call:
## lm(formula = ladder_score ~ logged_gdp_per_capita + social_support + 
##     healthy_life_expectancy + freedom_to_make_life_choices + 
##     generosity + perceptions_of_corruption, data = ssafrica)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31892 -0.29280  0.03784  0.43805  0.98049 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)
## (Intercept)                   0.246384   2.074988   0.119    0.906
## logged_gdp_per_capita         0.343159   0.211825   1.620    0.116
## social_support               -1.087070   1.731823  -0.628    0.535
## healthy_life_expectancy       0.007795   0.033919   0.230    0.820
## freedom_to_make_life_choices  0.757828   1.453040   0.522    0.606
## generosity                    1.382039   1.006261   1.373    0.180
## perceptions_of_corruption     1.608688   1.148309   1.401    0.172
## 
## Residual standard error: 0.6356 on 29 degrees of freedom
## Multiple R-squared:  0.2196, Adjusted R-squared:  0.05817 
## F-statistic:  1.36 on 6 and 29 DF,  p-value: 0.2636

Sub-Saharan Africa

  • Logged GDP per Capita INCREASES by 1; Ladder Score INCREASES by 0.343159

  • Social Support INCREASES by 1; Ladder Score DECREASES by 1.087070

  • Healthy Life Expectancy INCREASES by 1; Ladder Score INCREASES by 0.007795

  • Freedom to Make Life Choices INCREASES by 1; Ladder Score INCREASES by 0.757828

  • Generosity INCREASES by 1; Ladder Score INCREASES by 1.382039

  • Perceptions of Corruption INCREASES by 1; Ladder Score INCREASES by 1.608688

In Sub-Saharan Africa, the Perceptions of Corruption is what has the largest impact on the ladder score.

Question 5

Which region of the world has the highest ladder score?

ladder_model <-lm(ladder_score ~ social_support + freedom_to_make_life_choices, data = world_happiness_2021)

summary(ladder_model)
## 
## Call:
## lm(formula = ladder_score ~ social_support + freedom_to_make_life_choices, 
##     data = world_happiness_2021)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9892 -0.3735  0.0280  0.4863  1.4558 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -1.4393     0.4334  -3.321  0.00113 ** 
## social_support                 5.6489     0.5231  10.799  < 2e-16 ***
## freedom_to_make_life_choices   2.9935     0.5303   5.645 8.33e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6402 on 146 degrees of freedom
## Multiple R-squared:  0.6494, Adjusted R-squared:  0.6446 
## F-statistic: 135.2 on 2 and 146 DF,  p-value: < 2.2e-16
anova(ladder_model)
## Analysis of Variance Table
## 
## Response: ladder_score
##                               Df Sum Sq Mean Sq F value    Pr(>F)    
## social_support                 1 97.785  97.785 238.564 < 2.2e-16 ***
## freedom_to_make_life_choices   1 13.061  13.061  31.866 8.334e-08 ***
## Residuals                    146 59.844   0.410                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yhat <- ladder_model$fitted.values
res <- ladder_model$residuals
t <- rstudent(ladder_model)
stdres <- rstandard(ladder_model)
par(mfrow=c(1,2))

plot(yhat, res, ylab="Residuals", xlab="Fitted Value", main="Residual vs Fitted")
abline(0, 0)

plot(yhat, t, ylab="Studentized", xlab="Fitted Value", main="Studentized Residual vs Fitted")
abline(0, 0)

par(mfrow=c(1,2))

qqnorm(res, ylab="Residuals", xlab="Normal Scores",main="Residual Q-Q Plot") 
qqline(res)

qqnorm(t, ylab="Studentized Residuals", xlab="Normal Scores",main="Studentized Residual Q-Q Plot") 
qqline(t)

# which country has the highest ladder score?
country_ladder_score <- world_happiness_2021 %>% 
  group_by(country_name) %>% 
  summarize(country_name, ladder_score) %>% 
  arrange(desc(ladder_score)) %>% 
  distinct()

head(country_ladder_score, 1)
## # A tibble: 1 × 2
##   country_name ladder_score
##   <chr>               <dbl>
## 1 Finland              7.84
ggplot(world_happiness_2021, aes(x = fct_inorder(country_name), y = ladder_score)) +
  geom_point(color = "blue", size = 1) +
  scale_x_discrete(guide = guide_axis(angle = 90)) + 
  theme(axis.text.x = element_text(size = 4)) +
  xlab("Country") +
  ylab("Ladder Score") +
  ggtitle("Ladder Score of Each Country") 

Next, we will examine the average ladder score of the ten regions.

avg_region <- world_happiness_2021 %>% 
  group_by(regional_indicator) %>% 
  mutate(avg_ladder_score = sum(ladder_score) / n()) %>% 
  mutate(avg_social_support = sum(social_support) / n()) %>% 
  mutate(avg_ftmlc = sum(freedom_to_make_life_choices) / n()) %>% 
  summarize(regional_indicator, avg_ladder_score, avg_social_support, avg_ftmlc) %>% 
  arrange(desc(avg_ladder_score)) %>% 
  distinct()
## `summarise()` has grouped output by 'regional_indicator'. You can override using
## the `.groups` argument.
regions_model <-lm(avg_ladder_score ~ avg_social_support + avg_ftmlc, data = avg_region)

summary(regions_model)
## 
## Call:
## lm(formula = avg_ladder_score ~ avg_social_support + avg_ftmlc, 
##     data = avg_region)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5989 -0.1561  0.0303  0.2016  0.3885 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          -3.423      1.451  -2.359  0.05039 . 
## avg_social_support    9.373      1.849   5.069  0.00145 **
## avg_ftmlc             1.605      2.210   0.726  0.49125   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3481 on 7 degrees of freedom
## Multiple R-squared:  0.8798, Adjusted R-squared:  0.8454 
## F-statistic: 25.61 on 2 and 7 DF,  p-value: 0.0006026
anova(regions_model)
## Analysis of Variance Table
## 
## Response: avg_ladder_score
##                    Df Sum Sq Mean Sq F value    Pr(>F)    
## avg_social_support  1 6.1428  6.1428 50.6956 0.0001903 ***
## avg_ftmlc           1 0.0639  0.0639  0.5274 0.4912508    
## Residuals           7 0.8482  0.1212                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
yhat <- regions_model$fitted.values
res <- regions_model$residuals
t <- rstudent(regions_model)
stdres <- rstandard(regions_model)
par(mfrow=c(1,2))

plot(yhat, res, ylab="Residuals", xlab="Fitted Value", main="Residual vs Fitted")
abline(0, 0)

plot(yhat, t, ylab="Studentized", xlab="Fitted Value", main="Studentized Residual vs Fitted")
abline(0, 0)

par(mfrow=c(1,2))

qqnorm(res, ylab="Residuals", xlab="Normal Scores",main="Residual Q-Q Plot") 
qqline(res)

qqnorm(t, ylab="Studentized Residuals", xlab="Normal Scores",main="Studentized Residual Q-Q Plot") 
qqline(t)

# which region has the highest average ladder score?
head(avg_region, 1)
## # A tibble: 1 × 4
## # Groups:   regional_indicator [1]
##   regional_indicator    avg_ladder_score avg_social_support avg_ftmlc
##   <chr>                            <dbl>              <dbl>     <dbl>
## 1 North America and ANZ             7.13              0.934     0.899
ggplot(avg_region, aes(x = fct_inorder(regional_indicator), y = avg_ladder_score)) +
  geom_point(color = "blue") +
  scale_x_discrete(guide = guide_axis(angle = 60)) +
  xlab("Regions") +
  ylab("Average Ladder Score") +
  ggtitle("Average Ladder Score of Each Region")

Conclusions

  1. Generosity was not a factor into ladder score calculations for our data set.

  2. There is not one single factor that impacts ladder score most.

  3. The region that has the highest ladder score is North America and ANZ and the country with the highest ladder score is Finland.